{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2019-04-15 19:26:04--  https://nlp.stanford.edu/projects/snli/snli_1.0.zip\n",
      "Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140\n",
      "Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 94550081 (90M) [application/zip]\n",
      "Saving to: ‘snli_1.0.zip’\n",
      "\n",
      "snli_1.0.zip        100%[===================>]  90.17M  25.9MB/s    in 4.9s    \n",
      "\n",
      "2019-04-15 19:26:09 (18.4 MB/s) - ‘snli_1.0.zip’ saved [94550081/94550081]\n",
      "\n",
      "Archive:  snli_1.0.zip\n",
      "   creating: snli_1.0/\n",
      "  inflating: snli_1.0/.DS_Store      \n",
      "   creating: __MACOSX/\n",
      "   creating: __MACOSX/snli_1.0/\n",
      "  inflating: __MACOSX/snli_1.0/._.DS_Store  \n",
      " extracting: snli_1.0/Icon           \n",
      "  inflating: __MACOSX/snli_1.0/._Icon  \n",
      "  inflating: snli_1.0/README.txt     \n",
      "  inflating: __MACOSX/snli_1.0/._README.txt  \n",
      "  inflating: snli_1.0/snli_1.0_dev.jsonl  \n",
      "  inflating: snli_1.0/snli_1.0_dev.txt  \n",
      "  inflating: snli_1.0/snli_1.0_test.jsonl  \n",
      "  inflating: snli_1.0/snli_1.0_test.txt  \n",
      "  inflating: snli_1.0/snli_1.0_train.jsonl  \n",
      "  inflating: snli_1.0/snli_1.0_train.txt  \n",
      "  inflating: __MACOSX/._snli_1.0     \n"
     ]
    }
   ],
   "source": [
    "!wget https://nlp.stanford.edu/projects/snli/snli_1.0.zip\n",
    "!unzip snli_1.0.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "keys = ['train', 'test', 'dev']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = {k:[] for k in keys}\n",
    "for k in keys :\n",
    "    for line in open('snli_1.0/snli_1.0_' + k + '.jsonl').readlines() :\n",
    "        data[k].append(json.loads(line))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from Transparency.preprocess.vectorizer import cleaner\n",
    "from tqdm import tqdm_notebook"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "db342ddbd3db40958be01cc69ccb5a96",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=550152), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5d4e1fc1b3494d7daec565dc5915c137",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=550152), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "359289df3bc84e8a8c2f704f53e47da9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=550152), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d5d37588a0bf4253b000791d57e9eb35",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4c33712f2cbe40a2bcac38f31a51a976",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d964f11834f84cfba4ce8cab87bf81bb",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "dc99134f443f4dc1bce43d684fbfde83",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6ba773588c164cb88ac44b889f36830d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5f11f4153bd54ea2b84c718ec60fc4a7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "p, q, a = {}, {}, {}\n",
    "\n",
    "for k in keys :\n",
    "    p[k] = [cleaner(x['sentence1']) for x in tqdm_notebook(data[k]) if x['gold_label'] != '-']\n",
    "    q[k] = [cleaner(x['sentence2']) for x in tqdm_notebook(data[k]) if x['gold_label'] != '-']\n",
    "    a[k] = [cleaner(x['gold_label']) for x in tqdm_notebook(data[k]) if x['gold_label'] != '-']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "entity_list = ['neutral', 'contradiction', 'entailment']\n",
    "f = open('entity_list.txt', 'w')\n",
    "f.write(\"\\n\".join(entity_list))\n",
    "f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "df_paragraphs = []\n",
    "df_questions = []\n",
    "df_answers = []\n",
    "df_exp_splits = []\n",
    "\n",
    "for k in keys :\n",
    "    df_paragraphs += p[k]\n",
    "    df_questions += q[k]\n",
    "    df_answers += a[k]\n",
    "    df_exp_splits += [k] * len(p[k])\n",
    "    \n",
    "df = {'paragraph' : df_paragraphs, 'question' : df_questions, 'answer' : df_answers, 'exp_split' : df_exp_splits}\n",
    "df = pd.DataFrame(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv('snli_dataset.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Vocabulary size :  20981\n",
      "Found 20339 words in model out of 20981\n"
     ]
    }
   ],
   "source": [
    "%run \"../preprocess_data_QA.py\" --data_file snli_dataset.csv --output_file ./vec_snli.p --all_answers_file entity_list.txt \\\n",
    "--word_vectors_type glove.840B.300d --min_df 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
